Packages & Read in Data

#Loading pre-installed libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.0.5
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.5
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.0.5
library(repr)
## Warning: package 'repr' was built under R version 4.0.5
library(gifski)
## Warning: package 'gifski' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#turning off warnings
options(warn=-1)

#set directory
setwd("/Users/liamd/Documents/NFL Big Data Bowl/Data/")


##reading in non-tracking data

#includes play-by-play info on specific plays
df_plays <- read_csv("plays.csv",
                    col_types = cols())

#includes background info for players
df_players <- read_csv("players.csv",
                      col_types = cols())

head(df_plays)
## # A tibble: 6 x 25
##       gameId playId playDescription       quarter  down yardsToGo possessionTeam
##        <dbl>  <dbl> <chr>                   <dbl> <dbl>     <dbl> <chr>         
## 1 2018090600     37 J.Elliott kicks 65 y~       1     0         0 PHI           
## 2 2018090600    366 (9:20) C.Johnston pu~       1     4         4 PHI           
## 3 2018090600    658 (5:03) M.Bryant 21 y~       1     4         3 ATL           
## 4 2018090600    677 M.Bosher kicks 64 ya~       1     0         0 ATL           
## 5 2018090600    872 (:33) C.Johnston pun~       1     4        18 PHI           
## 6 2018090600    973 (14:18) M.Bosher pun~       2     4        19 ATL           
## # ... with 18 more variables: specialTeamsPlayType <chr>,
## #   specialTeamsResult <chr>, kickerId <dbl>, returnerId <chr>,
## #   kickBlockerId <dbl>, yardlineSide <chr>, yardlineNumber <dbl>,
## #   gameClock <time>, penaltyCodes <chr>, penaltyJerseyNumbers <chr>,
## #   penaltyYards <dbl>, preSnapHomeScore <dbl>, preSnapVisitorScore <dbl>,
## #   passResult <chr>, kickLength <dbl>, kickReturnYardage <dbl>,
## #   playResult <dbl>, absoluteYardlineNumber <dbl>
head(df_players)
## # A tibble: 6 x 7
##   nflId height weight birthDate  collegeName      Position displayName      
##   <dbl> <chr>   <dbl> <chr>      <chr>            <chr>    <chr>            
## 1 42901 6-1       208 1992-07-25 James Madison    SS       Dean Marlowe     
## 2 43501 6-0       220 1994-08-11 Central Michigan FS       Kavon Frazier    
## 3 43534 6-0       205 1993-06-02 Illinois         SS       Clayton Fejedelem
## 4 43535 6-1       235 1992-12-22 Temple           MLB      Tyler Matakevich 
## 5 44174 6-1       236 1993-02-22 Texas Tech       ILB      Sam Eguavoen     
## 6 44931 6-4       221 1993-09-16 North Carolina   WR       Mack Hollins
##Reading tracking data (needs to be done iteratively)

#weeks of NFL season
seasons <- seq(2018, 2020)

#blank dataframe to store tracking data
df_tracking <- data.frame()

#iterating through all weeks
for(s in seasons){
    
    #temperory dataframe used for reading season for given iteration
    df_tracking_temp <- read_csv(paste0("tracking",s,".csv"),
                                col_types = cols())
    
    #storing temporary dataframe in full season dataframe
    df_tracking <- bind_rows(df_tracking_temp, df_tracking)                            
    
}

head(df_tracking)
## # A tibble: 6 x 18
##   time                    x     y     s     a   dis     o   dir event nflId
##   <dttm>              <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 2021-01-03 18:03:02  61.2  46.8  0.08  0.13  0.01  186. 145.  None  42901
## 2 2021-01-03 18:03:02  61.2  46.8  0.11  0.19  0.01  185. 126.  None  42901
## 3 2021-01-03 18:03:02  61.2  46.8  0.1   0.18  0.01  183. 109.  None  42901
## 4 2021-01-03 18:03:02  61.2  46.8  0.11  0.22  0.01  180.  91.6 None  42901
## 5 2021-01-03 18:03:02  61.2  46.8  0.05  0.2   0.01  172. 120.  None  42901
## 6 2021-01-03 18:03:02  61.2  46.8  0.18  0.8   0.01  169. 237.  None  42901
## # ... with 8 more variables: displayName <chr>, jerseyNumber <dbl>,
## #   position <chr>, team <chr>, frameId <dbl>, gameId <dbl>, playId <dbl>,
## #   playDirection <chr>
rm(df_tracking_temp)

Cleaning Data

#Standardizing tracking data so its always in direction of punting team.
df_tracking <- df_tracking %>%
                mutate(x = ifelse(playDirection == "left", 120-x, x),
                       y = ifelse(playDirection == "left", 160/3 - y, y))

#Subset for punts
df_plays_punts <- df_plays %>% 
            filter(specialTeamsPlayType == 'Punt')

#Add kicker and returner names
df_plays_punts <- merge(x = df_plays_punts, y = df_players, by.x = "kickerId", by.y = "nflId", all.x = TRUE)
df_plays_punts <- merge(x = df_plays_punts, y = df_players, by.x = "returnerId", by.y = "nflId", all.x = TRUE)

df_plays_punts <- df_plays_punts %>% 
  rename(kickerName = displayName.x) %>% 
  rename(returnerName = displayName.y)

#Change 'OAK' and 'LV' to be 'OAK/LV' since the Raiders moved
df_plays_punts <- df_plays_punts %>% 
  mutate(possessionTeam = ifelse(possessionTeam == 'OAK', 'OAK_LV', possessionTeam)) %>% 
  mutate(possessionTeam = ifelse(possessionTeam == 'LV', 'OAK_LV', possessionTeam))

#Subset for Panthers
#df_plays_panthers <- df_plays_punts %>% 
#            filter(possessionTeam == 'CAR')

#Subset for Patriots
#df_plays_patriots <- df_plays_punts %>% 
#            filter(possessionTeam == 'NE')

Summary Data on Punt Return Yardage

#---------------------------- TEAMS ----------------------------------
#Average punt return yardage by kicking team (lower value better for the team)
df_avg_punt_team <- df_plays_punts %>% 
  group_by(possessionTeam) %>% 
  summarise(avgPuntReturn = mean(kickReturnYardage, na.rm = TRUE)) %>% 
  rename(kickingTeam = possessionTeam) %>% 
  arrange(avgPuntReturn) 

## Get obs count where the punt was returned
team_count <- df_plays_punts %>% 
  count(possessionTeam, specialTeamsResult == "Return")

team_count <- team_count %>% 
  filter(`specialTeamsResult == "Return"` == TRUE) %>% 
  select(-`specialTeamsResult == "Return"`)

## Merge datasets
df_avg_punt_team <- merge(x = df_avg_punt_team, y = team_count, by.x = "kickingTeam", by.y = "possessionTeam", all.x = TRUE)

## Plot
ggplot(data = df_avg_punt_team, aes(x = avgPuntReturn, y = n, label = kickingTeam, color = -avgPuntReturn)) + 
  geom_point() + 
  scale_x_reverse() +
  geom_text(aes(label = kickingTeam), hjust = -.3) +
  scale_color_gradientn(colors = rainbow(5)) +
  ggtitle("Average Punt Return Against by Team") +
  ylab("Number of Punts") + 
  xlab("Average Punt Return Against") + 
  theme_minimal()

#---------------------------- KICKERS ----------------------------------#
#Average punt return yardage by kicker (lower value better)
df_avg_punt_kicker <- df_plays_punts %>% 
  group_by(kickerName) %>% 
  summarise(avgPuntReturn = mean(kickReturnYardage, na.rm = TRUE)) %>% 
  arrange(avgPuntReturn)

df_avg_punt_kicker
## # A tibble: 54 x 2
##    kickerName          avgPuntReturn
##    <chr>                       <dbl>
##  1 Jake Elliott                 0   
##  2 Ryan Winslow                 1.5 
##  3 Drew Kaser                   3.8 
##  4 Thomas Morstead              5.10
##  5 Sterling Hofrichter          5.4 
##  6 Jack Fox                     5.5 
##  7 Dustin Colquitt              5.81
##  8 Logan Cooke                  6.26
##  9 Kevin Huber                  6.34
## 10 Rigoberto Sanchez            6.48
## # ... with 44 more rows
## Get obs count
kicker_count <- df_plays_punts %>% 
  count(kickerName, specialTeamsResult == "Return")

kicker_count <- kicker_count %>% 
  filter(`specialTeamsResult == "Return"` == TRUE) %>% 
  select(-`specialTeamsResult == "Return"`)

## Merge datasets
df_avg_punt_kicker <- merge(x = df_avg_punt_kicker, y = kicker_count, by.x = "kickerName", by.y = "kickerName", all.x = TRUE)

## Plot
kicker_plot <- ggplot(data = df_avg_punt_kicker, aes(x = avgPuntReturn, y = n, text = kickerName, color = -avgPuntReturn)) + 
  geom_point() + 
  scale_x_reverse() +
  scale_color_gradientn(colors = rainbow(5)) +
  ggtitle("Average Punt Return Against by Kicker") +
  ylab("Number of Punts") + 
  xlab("Average Punt Return Against") + 
  theme_minimal()

ggplotly(kicker_plot, tooltip = c("text", "x", "y"))
#---------------------------- RETURNERS ----------------------------------#
#Average punt return yardage by returner (higher value better)
df_avg_punt_returner <- df_plays_punts %>% 
  group_by(returnerName) %>% 
  summarise(avgPuntReturn = mean(kickReturnYardage, na.rm = TRUE)) %>%
  arrange(desc(avgPuntReturn))

df_avg_punt_returner
## # A tibble: 185 x 2
##    returnerName     avgPuntReturn
##    <chr>                    <dbl>
##  1 Jalen Reagor              31.3
##  2 Corey Coleman             19  
##  3 Keelan Cole               15.6
##  4 Nyheim Hines              15.0
##  5 Gunner Olszewski          13.5
##  6 Brandon Aiyuk             13  
##  7 Cyrus Jones               12.6
##  8 Jakeem Grant              12.5
##  9 Isaiah Rodgers            12  
## 10 Malcolm Perry             12  
## # ... with 175 more rows
## Get obs count
returner_count <- df_plays_punts %>% 
  count(returnerName, specialTeamsResult == "Return")

returner_count <- returner_count %>% 
  filter(`specialTeamsResult == "Return"` == TRUE) %>% 
  select(-`specialTeamsResult == "Return"`)

## Merge datasets
df_avg_punt_returner <- merge(x = df_avg_punt_returner, y = returner_count, by.x = "returnerName", by.y = "returnerName", all.x = TRUE)
df_avg_punt_returner <- na.omit(df_avg_punt_returner)

## Plot
returner_plot <- ggplot(data = df_avg_punt_returner, aes(x = avgPuntReturn, y = n, text = returnerName, color = avgPuntReturn)) + 
  geom_point() + 
  scale_color_gradientn(colors = rainbow(5)) +
  ggtitle("Average Punt Return by Returner") +
  ylab("Number of Punts") + 
  xlab("Average Punt Return") + 
  theme_minimal()

ggplotly(returner_plot, tooltip = c("text", "x", "y"))
# Remove unnecessary variables
rm(list = c('kicker_count', 'team_count', 'returner_count'))